0.1 Variable descriptions

0.2 Read in datasets

Language codes (from WALS) – WALS, ISO, ascii-name. These are used to merge across datasets.

codes = read.csv("../data/language_codes.csv") %>%
  select(language, WALS, ISO) %>%
  mutate(ISO = unlist(lapply(strsplit(as.character(ISO),
                                      ","),function(x) x[1])))
# in cases where there are two ISO codes, takes first

Lupyan & Dale (2010): Demographic variables and syntactic compelxity

ld = read.table("../data/lupyan_2010.txt", fill = T, 
               header = T, sep = "\t", na.strings = "*") %>%
    left_join(codes, c("walsCode" = "WALS")) %>%
    rename(pop_log = logpop2, 
           mean.temp = aveTemp,
           n.neighbors = numNeighbors,
           sum.precip = sumPrecip,
           sd.temp = sdTemp,
           sd.precip = sdPrecip,
           lang.family = langFamily,
           lang.genus = langGenus,
           native.country = nativeCountry,
           native.country.area = nativeCountryArea,
           growing.season = growingSeason)

# get means across language
ld.demo = ld %>%
     group_by(ISO) %>%
     summarise_each(funs(mean(., na.rm = TRUE)), 
                    c(8:9, 16:121))  %>%
     select(1:3, 93:95, 100:101, 103:105, 108)

# add in data family
fams = ld %>%
      group_by(ISO) %>%
      filter(row_number() == 1) %>%
      select(ISO, lang.family, lang.genus, native.country,
             native.country.area, language)  

# we're getting country here from choosing the most frequent country for a language
fams.country = ld %>%
        group_by(ISO) %>%
        summarise(country = most.frequent.level(country)) %>%
        left_join(fams)

ld.demo = left_join(fams.country, ld.demo, by = "ISO") %>%
          ungroup()

WALS features of complexity

# get variables to include 
qual = ld %>%
  select(18:103, 124) %>%
  mutate_each(funs(as.factor)) %>%
  group_by(ISO) %>%
  summarise_each(funs(most.frequent.level)) %>%
  mutate_each(funs(as.factor)) 

ld_feature_names = read.csv("../data/lupyan_2010_all_feature_mappings.csv") 
qualVarNames = intersect(names(qual), ld_feature_names$WALS.feature.name)
qual = select(qual,which(is.element(names(qual), c("ISO", qualVarNames))))

# remap factor levels to complexity values (0,1)
# note two variables that are reported in the paper (NICPOC[59] and CYSIND[39] are missing in the data)

for (i in 1:length(qualVarNames)){
  thisVarLabs = ld_feature_names[ld_feature_names$WALS.feature.name == qualVarNames[i],]
  old = thisVarLabs$ld.level.label
  new = thisVarLabs$ld.complexity
  col_i = grep(qualVarNames[i], colnames(qual))
  qual[,col_i] = mapvalues(as.matrix(qual[,col_i]), 
                           from = as.character(old),
                           to = as.character(new), warn_missing = TRUE)
}

ld.complexity = qual %>%
  mutate_each(funs(as.numeric), -ISO) %>%
  gather(variable, complexity.level, 2:28) %>%
  group_by(ISO) %>%
  summarise(morphological.complexity = sum(complexity.level))

ld.demo.qual = left_join(ld.demo, ld.complexity)

Bentz, et al. (2015): L2 learners and lexical diversity Note here that we are only looking at languages from the UDHR corpus. The other corpora have smaller languages only.

bentz = read.csv("../data/bentz_2015.csv") %>%
    gather(temp, LDT, starts_with("LDT")) %>% 
    unite(temp1, measure, temp, sep = ".") %>% 
    spread(temp1, LDT) %>%
    filter(text == "UDHR") %>%
    select(iso_639_3, RatioL2, a.LDTscaled, 
           H.LDTscaled, TTR.LDTscaled, Stock,
           Region, L1_speakers, L2_speakers) %>%
    rename(ISO = iso_639_3,
           n.L1.speakers = L1_speakers,
           n.L2.speakers = L2_speakers,
           ratio.L2.L1 = RatioL2,
           stock = Stock,
           region = Region,
           scaled.LDT.TTR = TTR.LDTscaled,
           scaled.LDT.ZM = a.LDTscaled,
           scaled.LDT.H = H.LDTscaled) %>%
    left_join(codes, by="ISO") %>%
    mutate(ISO = as.factor(ISO)) %>%
    select(-language, -WALS) 

# these were taken directly from ethnologue website by ML for missing languages that were high-frequency across the datasets
ethno_sup = read.csv("../data/supplementary_enthnologue.csv")
bentz = full_join(bentz, ethno_sup)

Atkinson (2011)

atkinson = read.csv("../data/atkinson_2011.csv") %>%
  select(normalized.vowel.diversity,
         normalized.consonant.diversity,    
         normalized.tone.diversity,
         normalized.phoneme.diversity, ISO, 
         distance.from.origin) %>%
  mutate(ISO = unlist(lapply(strsplit(as.character(ISO),
                                      " "),function(x) x[1]))) %>%
  left_join(codes, by="ISO") %>%
  select(-language, -WALS)

Lewis & Frank (under review): Complexity Bias

# complexity bias
cb = read.csv("../data/lewis_2015.csv") %>%
      rename(complexity.bias = corr,
              p.complexity.bias = p.corr,
              mono.complexity.bias = mono.cor,
              open.complexity.bias = open.cor) %>%
      left_join(codes, by="language") %>%
    select(-X.1, -X, -lower.ci, -upper.ci, -checked,
           -mean.length) %>%
    distinct(ISO) %>%
   filter(language != "english") %>% # english is an outlier in this dataset because norms were colelcted in english
  select(-language, -WALS)

Futrell, Mahowald, & Gibson (2015): Dependency length

dl = read.csv("../data/futrell_2015.csv") %>%
  left_join(codes, by="language") %>%
  select(-language, -WALS, -fixed.random.baseline.slope, 
         -observed.slope, -m_rand) %>%
  rename(mean.dependency.length = m_obs)

Pellegrino, Coupe, & Marsico (2015): Information density

uid = read.csv("../data/pellegrino_2015.csv")  %>%
  left_join(codes, by="language") %>%
  select(-language, -WALS)

Moran, McCloy, & Wright (2012): Phonemic inventory

phoneme = read.csv("../data/moran_2012.csv") %>%
          select(ISO, pho, con, vow, son, 
                 obs, mon, qua, ton) %>%
          rename(n.phonemes = pho,
                 n.consonants = con,
                 n.vowels = vow,
                 n.sonorants = son,
                 n.obstruents = obs,
                 n.monophthongs = mon,
                 n.qual.monophthongs = qua,
                 n.tones = ton)

Wichmann, et al. (2013): Mean word length

# note is this IPA? - fix this!
ml = read.csv("../data/wichmann_2013.csv") %>% 
  select(1,1:109) %>%
  gather(word,translation,I:name) %>%
  mutate(nchar = unlist(
    lapply(
      lapply(
        strsplit(
          gsub("[[:space:]]", "", translation) ,
          ","), 
           nchar), mean))) %>%
  filter(translation != "")

# subset to only those words in the swadesh list (n = 40)
swadesh.words  = ml[ml$ISO == "gwj", "word"] 
ml = ml %>%
      filter(is.element(word, swadesh.words))

ml.d = ml %>%
        group_by(ISO) %>%
        summarize(mean.length = mean(nchar, na.rm = T)) 

Luniewska, et al. (2015): Age of acquistion (Aoa)

aoa.raw = read.csv("../data/luniewska_2015.csv", 
                       header = T, fileEncoding = "latin1")
  
aoa = aoa.raw %>%
  gather(language_aoa, aoa, grep("aoa", 
                                 names(aoa.raw))) %>%
  mutate(language_aoa = tolower(unlist(lapply(strsplit(
    as.character(language_aoa),"_"), function(x) x[1])))) %>%
  select(-3:-27) %>%
  rename(language = language_aoa) %>%
  left_join(codes, by="language") %>%
  mutate(language = as.factor(language)) %>%
  group_by(ISO) %>%
  summarize(mean.aoa = mean(aoa, na.rm = T))

Merge data frames together by ISO

d = full_join(ld.demo.qual, cb, by = "ISO") %>%
    full_join(bentz, by = "ISO") %>%
    full_join(dl, by = "ISO") %>%
    full_join(uid, by = "ISO") %>%
    full_join(phoneme, by = "ISO") %>%
    full_join(ml.d, by = "ISO") %>%
    full_join(aoa, by = "ISO") %>%
    full_join(atkinson, by = "ISO") %>%
    distinct(ISO) %>%
    filter(!is.na(ISO), ISO !="") %>%
    mutate(ISO = as.factor(ISO))

d$pop_log= ifelse(is.na(d$pop_log), log(d$n.L1.speakers), d$pop_log) # uses bentz values where missing from WALS
d$ratio.L2.L1= ifelse(is.na(d$ratio.L2.L1), d$n.L2.speakers/d$n.L1.speakers, d$ratio.L2.L1) 

#write.csv(d, "../data/langLearnVar_data.csv")

load data

d = read.csv("../data/langLearnVar_data.csv")[,-1]

0.3 Data cleaning

Remove outliers and check for normality

d.clean = d %>%
  mutate_each(funs(removeOutlier(.,N_EXCLUDE_SDS)), c(-ISO, -lang.family, 
              -native.country, -native.country.area, 
         -lang.genus, -language, -stock, -region, -country))

d.clean %>%
  select(-ISO, -lang.family, -native.country, -native.country.area, 
         -lang.genus, -language, -stock, -region, -country) %>%
  gather(var, value) %>%
  ggplot(aes(sample = value)) + 
  stat_qq(na.rm = T, size = .2) + 
  facet_wrap( ~ var, scales = "free") +
  theme_bw()

The following variables look right-skewed.

NN_vars = c("area", "perimeter", "n.neighbors", "sd.precip",
           "ratio.L2.L1", "n.L1.speakers", "n.L2.speakers",
           "n.phonemes", "n.consonants","n.vowels", "n.sonorants",
           "n.obstruents", "n.monophthongs", "n.qual.monophthongs",
           "n.tones")

Take the log of these variables, and check for normality again.

d.clean = d.clean %>%
  mutate_each(funs(log = log(.), dummy =  sum(.)),
              one_of(NN_vars)) %>%
  select(-contains("dummy")) %>%
  select(-one_of(NN_vars))

d.clean %>%
  select(-ISO, -lang.family, -native.country, -native.country.area, 
         -lang.genus, -language, -stock, -region, -country) %>%
  gather(var, value) %>%
  ggplot(aes(sample = value)) + 
  stat_qq(na.rm = T, size = .2) + 
  facet_wrap( ~ var, scales = "free") +
  theme_bw()

Check for normality of variables using Shapiro-Wilk Normality Test

is.na(d.clean) <- sapply(d.clean, is.infinite)

normality.test = d.clean %>%
  summarise_each(funs(unlist(tidy(shapiro.test(.))[2])),
                 c(-ISO, -lang.family, -native.country,
                   -native.country.area, -country,
                     -lang.genus, -language, -stock, -region)) %>%
  gather(variable, shapiro.p.value) %>%
  mutate(normal = ifelse(shapiro.p.value > .05, TRUE, FALSE)) %>%
  summary()

By this test, only 11 variables normal. But, I think this is the best we can do (?).

Look at histograms of all variables.

demo_vars = c("lat", "lon", "perimeter_log", "n.neighbors_log",
              "mean.temp", "sum.precip",
              "sd.temp", "growing.season", "pop_log", "area_log",
              "n.neighbors_log", "sd.precip_log", "ratio.L2.L1_log",
              "n.L1.speakers_log",
              "n.L2.speakers_log", "distance.from.origin")

lang_vars = c(setdiff(names(d.clean), demo_vars))[-c(1:6, 16,17)]

d.clean %>%
  gather(variable, num, lat:length(d.clean))  %>%
  filter(variable != "stock", variable != "region") %>%
  mutate(var_type = ifelse(is.element(variable, demo_vars),
                           "demo", "lang"),
         num = as.numeric(as.character(num))) %>%
  arrange(var_type) %>% 
  mutate(variable = factor(variable, variable)) %>%
  ggplot(aes(x = num, fill = var_type)) + 
    geom_histogram(position = "identity") +
    facet_wrap( ~ variable, scales = "free") + 
    theme_bw()

Number of languages we have data, for each variable

d.clean %>%
  gather(variable, num, lat:length(d.clean))  %>%
  filter(variable != "stock", variable != "region") %>%
  group_by(variable) %>%
  summarize(counts = length(which(is.na(num) == F)))  %>%
  mutate(var_type = ifelse(is.element(variable, demo_vars),
                           "demo", "lang")) %>%
  mutate(counts_trunc = ifelse(counts > 150, 150, counts))  %>%
  arrange(var_type) %>% 
  mutate(variable = factor(variable, variable)) %>%
  ggplot(aes(y=counts_trunc, x = variable, fill = var_type)) + 
    geom_bar(stat = "identity") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
          legend.position = "none") +
    ylab("Number of languages (truncated at 150)") +
    ggtitle("Number languages per measure")

Look at number of languages intersecting each variable

arealControl_vars = c("stock", "region", "lang.family", "lang.genus",
                      "native.country", "native.country.area",
                      "country")

d.clean %>%
  gather(lang_measure, lang_value, 
         which(is.element(names(.), c(lang_vars, arealControl_vars)))) %>%
  group_by(lang_measure) %>%
  gather(pop_measure, pop_value,
         which(is.element(names(.), demo_vars))) %>%
  group_by(lang_measure, pop_measure) %>%
  mutate(both_not_na = !is.na(lang_value) & !is.na(pop_value)) %>%
  summarise(n_languages = length(which(both_not_na == TRUE))) %>%
  ggplot(aes(pop_measure, lang_measure, group = lang_measure)) +
      geom_tile(aes(fill = n_languages)) + 
      geom_text(aes(fill = n_languages, label = n_languages)) +
      scale_fill_gradient(low = "white", high = "red")  +
      xlab("population variables") + 
      ylab("language variables") + 
      ggtitle("Number languages between variables") +
      theme(axis.text.x = element_text(angle = 90, hjust = 1))

0.4 Relationship between language and demographic variables

Fits controling for family – controlling for family in a mixed effect model

# these are the variables we care about for the paper
demo_vars_crit = c("n.neighbors_log",
              "mean.temp", "sum.precip", "sd.temp",
              "pop_log", "area_log", "n.neighbors_log",
              "sd.precip_log", "ratio.L2.L1_log",
              "distance.from.origin")

lang_vars_crit = c("p.complexity.bias","scaled.LDT.TTR",
                 "mean.dependency.length",
              "mean.length", "mean.aoa",
              "n.consonants_log", "n.vowels_log", "morphological.complexity",
              "information.density", "information.rate")

d.scatter.fam = d.clean %>% 
  select(ratio.L2.L1_log, pop_log, distance.from.origin,
         n.neighbors_log, area_log, mean.temp, sd.temp, sum.precip,
         sd.precip_log, n.consonants_log, n.vowels_log, mean.length,
         p.complexity.bias, scaled.LDT.TTR, morphological.complexity,
         mean.aoa, lang.family, country) %>%
  gather(lang_measure, lang_value, 
         which(is.element(names(.), lang_vars_crit))) %>%
  group_by(lang_measure) %>%
  gather(pop_measure, pop_value,
         which(is.element(names(.), demo_vars_crit)))

is.na(d.scatter.fam) <- sapply(d.scatter.fam, is.infinite) 
d.scatter.fam <- filter(d.scatter.fam, !is.na(pop_value),
                        !is.na(lang_value))

# get model fits
d.model.fits = d.scatter.fam %>%
  group_by(lang_measure, pop_measure) %>%
  do(tidy(lmer(lang_value ~ pop_value + (pop_value|lang.family) +
                 (1|country), data=.))) %>%
  filter(term == "pop_value") %>%
  mutate(sig = ifelse(abs(statistic) > 1.96, "*", "")) %>%
  mutate(sig.col = ifelse(statistic > 1.96, "pos",
                          ifelse(statistic < -1.96, "neg",
                                 "none"))) %>%
  mutate(pop_value = .1, lang_value = .1) %>% # this is a hack
  ungroup

d.model.fits[d.model.fits == "NaN"] = "NA"

# plot
ggplot(d.scatter.fam, aes(x = pop_value, y = lang_value)) +
  geom_rect(data = d.model.fits, aes(fill = sig.col), 
          xmin = -Inf, xmax = Inf,
          ymin = -Inf, ymax = Inf, alpha = 0.2) +
  geom_point(size = .3) +
  geom_smooth(method = "lm", color = "green") +
  facet_grid(lang_measure ~ pop_measure, scales = "free") +
  scale_fill_manual(values = c( "mediumblue", "grey99","red1")) +
  theme_bw() +
  xlab("Demographic variables") +
  ylab("Language variables") +
  theme(legend.position = "none")

0.5 Relationship between language and demographic variables, with PCA

# get demographic variables only (excluding L2 because intersections contains only 40 languages)
d.demo =  d.clean %>%
  select(lat, mean.temp, sd.temp, sum.precip, sd.precip_log, 
         area_log, pop_log, n.neighbors_log) %>% #
  mutate(lat = abs(lat))
is.na(d.demo) <- sapply(d.demo, is.infinite) 

# do pca
pca.demo = prcomp(na.omit(d.demo), scale = TRUE)
barplot(pca.demo$sdev/pca.demo$sdev[1])

# look at varence explained
fviz_eig(pca.demo, addlabels=TRUE, 
               linecolor ="red", geom = "line") +
  theme_bw()

#pca.demo = prcomp(na.omit(d.demo), scale = TRUE, tol = .45)

# plot loadings
pcs.demo <- cbind(names(d.demo),
                    gather(as.data.frame(pca.demo$rotation), 
                           pc, value))
names(pcs.demo)[1] = "variable"
 
pcs.demo$variable =  factor(pcs.demo$variable , levels = pcs.demo$variable[1:8]) # order variables
pcs.demo$magnitude = ifelse(abs(pcs.demo$value)>.2, TRUE, FALSE)
ggplot(pcs.demo) +
  geom_bar(aes(x = variable, y = value,
               fill = magnitude), stat="identity") +
  facet_wrap(~pc) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))  + 
       scale_fill_manual(name = "Magnitude", 
                       values = c("grey","red1")) +
  ggtitle("PCA loadings")

# merge PCAs with full dataset for model fits
d.demo[!is.na(d.demo)] = 0
d.demo[is.na(d.demo)] = 1
good_is = which(rowSums(d.demo) == 0)
d.pca.demo = d.clean[good_is,] %>% cbind(pca.demo$x)

names(d.pca.demo)[names(d.pca.demo) == "PC1"] = "cold_and_dry_climate"
names(d.pca.demo)[names(d.pca.demo) == "PC2"] = "big"
names(d.pca.demo)[names(d.pca.demo) == "PC3"] = "cool_and_rainy_moderate_climate"

Principle components by language

ggplot(d.pca.demo) +   
  borders("world", colour="gray50", fill="gray50") +
  geom_point(aes(x = lon, y=lat, 
               color = cold_and_dry_climate), size=3) +
  scale_colour_gradient(high="green", low = "white") +
  ggtitle("Wet and hot (PC1)") + 
  mapTheme

ggplot(d.pca.demo) +   
  borders("world", colour="gray50", fill="gray50") +
  geom_point(aes(x = lon, y=lat, 
               color = big), size=3) +
  scale_colour_gradient(high="blue", low = "white") +
  ggtitle("big (PC2)") + 
  mapTheme

ggplot(d.pca.demo) +   
  borders("world", colour="gray50", fill="gray50") +
  geom_point(aes(x = lon, y=lat, 
               color = cool_and_rainy_moderate_climate), size=3) +
  scale_colour_gradient(high="red", low = "white") +
  ggtitle("Desert climate (PC3)") + 
  mapTheme

Model fits using PCA

demo_vars_crit2 = c("cold_and_dry_climate", "big", "cool_and_rainy_moderate_climate")

lang_vars_crit2 = c("morphological.complexity", "p.complexity.bias","scaled.LDT.TTR",
              "mean.length", "mean.aoa",
              "n.consonants_log", "n.vowels_log")

d.scatter.fam = d.pca.demo %>% 
  select(n.consonants_log, n.vowels_log, 
          mean.length, p.complexity.bias, scaled.LDT.TTR, morphological.complexity, mean.aoa,
         lang.family, country, cold_and_dry_climate, big,
         cool_and_rainy_moderate_climate) %>%
  gather(lang_measure, lang_value, 
         which(is.element(names(.), lang_vars_crit2))) %>%
  group_by(lang_measure) %>%
  gather(pop_measure, pop_value,
         which(is.element(names(.), demo_vars_crit2)))

is.na(d.scatter.fam) <- sapply(d.scatter.fam, is.infinite) 
d.scatter.fam <- filter(d.scatter.fam, !is.na(pop_value),
                        !is.na(lang_value))

# get model fits
d.model.fits = d.scatter.fam %>%
  group_by(lang_measure, pop_measure) %>%
  do(tidy(lmer(lang_value ~ pop_value + (pop_value|lang.family) + (1|country),
               data=.))) %>%
  filter(term == "pop_value") %>%
  mutate(sig.col = ifelse(statistic > 1.96, "pos",
                          ifelse(statistic < -1.96, "neg",
                                 "none"))) %>%
  mutate(pop_value = .1, lang_value = .1) %>% # this is a hack
  ungroup

d.model.fits[d.model.fits == "NaN"] = "NA"
ggplot(d.scatter.fam, aes(x = pop_value, y = lang_value)) +
  geom_rect(data = d.model.fits, aes(fill = sig.col), 
        xmin = -Inf, xmax = Inf,
        ymin = -Inf, ymax = Inf, alpha = 0.2) +
  geom_point(size = .3) +
  geom_smooth(method = "lm", color = "green") +
  facet_grid(lang_measure~pop_measure, scales = "free") +
  scale_fill_manual(values = c( "mediumblue", "grey99","red1")) +
  theme_bw() +
  xlab("Demographic variables") +
  ylab("Language variables") +
  theme(legend.position = "none")

0.6 Relationship between PC on language variables

d.lang =  d.clean %>%
  select(morphological.complexity, scaled.LDT.H, 
         mean.length, n.consonants_log, n.vowels_log) 
is.na(d.lang) <- sapply(d.lang, is.infinite) 

# do pca
pca.lang = prcomp(na.omit(d.lang), scale = TRUE)

# look at varence explained
fviz_eig(pca.lang, addlabels=TRUE, 
               linecolor ="red", geom = "line") +
  theme_bw()

#pca.lang = prcomp(na.omit(d.lang), scale = TRUE, tol= .55)

# plot loadings
pcs.lang <- cbind(names(d.lang),
                    gather(as.data.frame(pca.lang$rotation), 
                           pc, value))
names(pcs.lang)[1] = "variable"
 
pcs.lang$variable =  factor(pcs.lang$variable , levels = pcs.lang$variable[1:8]) # order variables
pcs.lang$magnitude = ifelse(abs(pcs.lang$value)>.2, TRUE, FALSE)
ggplot(pcs.lang) +
  geom_bar(aes(x = variable, y = value,
               fill = magnitude), stat="identity") +
  facet_wrap(~pc) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))  + 
       scale_fill_manual(name = "Magnitude", 
                       values = c("grey","red1")) +
  ggtitle("PCA loadings")

# merge PCAs with full dataset for model fits
d.lang[!is.na(d.lang)] = 0
d.lang[is.na(d.lang)] = 1
good_is = which(rowSums(d.lang) == 0)
d.pca.lang = d.clean[good_is,] %>% cbind(pca.lang$x)

names(d.pca.lang)[names(d.pca.lang) == "PC1"] = "PC1_L"
names(d.pca.lang)[names(d.pca.lang) == "PC2"] = "PC2_L"
names(d.pca.lang)[names(d.pca.lang) == "PC3"] = "PC3_L"

Model fits using PCA - all demographic to predict lang principle components

lang_vars_crit2 = c("PC1_L", "PC2_L", "PC3_L")

demo_vars_crit2 = c("n.neighbors_log",
              "mean.temp", "sum.precip", "sd.temp",
              "pop_log", "area_log", "n.neighbors_log",
              "sd.precip_log", "ratio.L2.L1_log",
              "distance.from.origin")

d.scatter.fam = d.pca.lang %>% 
  select(ratio.L2.L1_log, pop_log, distance.from.origin,
         n.neighbors_log, area_log, mean.temp, sd.temp, sum.precip, PC1_L, PC2_L, PC3_L, lang.family, country) %>%
  gather(lang_measure, lang_value, 
         which(is.element(names(.), lang_vars_crit2))) %>%
  group_by(lang_measure) %>%
  gather(pop_measure, pop_value,
         which(is.element(names(.), demo_vars_crit2)))

is.na(d.scatter.fam) <- sapply(d.scatter.fam, is.infinite) 
d.scatter.fam <- filter(d.scatter.fam, !is.na(pop_value),
                        !is.na(lang_value))

# get model fits
d.model.fits = d.scatter.fam %>%
  group_by(lang_measure, pop_measure) %>%
  do(tidy(lmer(lang_value ~ pop_value +(1|country), data=.))) %>%
  filter(term == "pop_value") %>%
  mutate(sig.col = ifelse(statistic > 1.96, "pos",
                          ifelse(statistic < -1.96, "neg",
                                 "none"))) %>%
  mutate(pop_value = .1, lang_value = .1) %>% # this is a hack
  ungroup

d.model.fits[d.model.fits == "NaN"] = "NA"
ggplot(d.scatter.fam, aes(x = pop_value, y = lang_value)) +
  geom_rect(data = d.model.fits, aes(fill = sig.col), 
          xmin = -Inf, xmax = Inf,
          ymin = -Inf, ymax = Inf, alpha = 0.2) +
  geom_point(size = .3) +
  geom_smooth(method = "lm", color = "green") +
  facet_grid(lang_measure~pop_measure, scales = "free") +
  scale_fill_manual(values = c( "mediumblue", "grey99","red1")) +
  theme_bw() +
  xlab("Demographic variables") +
  ylab("Language variables") +
  theme(legend.position = "none")

All PCAs

all.pca = left_join(d.pca.demo, d.pca.lang[,c("PC1_L", "PC2_L", "PC3_L", "ISO")],
                    by = "ISO")

demo_vars_crit2 = c("cold_and_dry_climate", "big",
                    "cool_and_rainy_moderate_climate")
lang_vars_crit2 = c("PC1_L", "PC2_L", "PC3_L")

d.scatter.fam = all.pca %>% 
  select(PC1_L, PC2_L, PC3_L, cold_and_dry_climate, big, cool_and_rainy_moderate_climate, lang.family, country, ISO) %>%
  gather(lang_measure, lang_value, 
         which(is.element(names(.), lang_vars_crit2))) %>%
  group_by(lang_measure) %>%
  gather(pop_measure, pop_value,
         which(is.element(names(.), demo_vars_crit2)))

is.na(d.scatter.fam) <- sapply(d.scatter.fam, is.infinite) 
d.scatter.fam <- filter(d.scatter.fam, !is.na(pop_value),
                        !is.na(lang_value))

# get model fits
d.model.fits = d.scatter.fam %>%
  group_by(lang_measure, pop_measure) %>%
  do(tidy(lmer(lang_value ~ pop_value + (pop_value|lang.family) + (1|country),
               data=.))) %>%
  filter(term == "pop_value") %>%
  mutate(sig.col = ifelse(statistic > 1.96, "pos",
                          ifelse(statistic < -1.96, "neg",
                                 "none"))) %>%
  mutate(pop_value = .1, lang_value = .1) %>% # this is a hack
  ungroup

d.model.fits
## Source: local data frame [9 x 10]
## 
##   lang_measure                     pop_measure      term    estimate
##         (fctr)                          (fctr)     (chr)       (dbl)
## 1        PC1_L            cold_and_dry_climate pop_value -0.21432866
## 2        PC1_L                             big pop_value -0.33828106
## 3        PC1_L cool_and_rainy_moderate_climate pop_value  0.18446301
## 4        PC2_L            cold_and_dry_climate pop_value  0.08160239
## 5        PC2_L                             big pop_value  0.33899999
## 6        PC2_L cool_and_rainy_moderate_climate pop_value  0.24988791
## 7        PC3_L            cold_and_dry_climate pop_value  0.26378869
## 8        PC3_L                             big pop_value  0.22245329
## 9        PC3_L cool_and_rainy_moderate_climate pop_value -0.31329400
## Variables not shown: std.error (dbl), statistic (dbl), group (chr),
##   sig.col (chr), pop_value (dbl), lang_value (dbl)
d.model.fits[d.model.fits == "NaN"] = "NA"
ggplot(d.scatter.fam, aes(x = pop_value, y = lang_value)) +
        geom_rect(data = d.model.fits, aes(fill = sig.col), 
                xmin = -Inf, xmax = Inf,
                ymin = -Inf, ymax = Inf, alpha = 0.2) +
      geom_point(size = .3) +
      geom_smooth(method = "lm", color = "green") +
      facet_grid(lang_measure~pop_measure, scales = "free") +
     #scale_fill_manual(values = c( "mediumblue", "grey99","red1")) +
    theme_bw() +
    xlab("Demographic variables") +
    ylab("Language variables") +
    theme(legend.position = "none")

0.7 Replicate Lupyan and Dale (2010) complexity score plot (Fig. 3)

ld.demo.qual %>%
  mutate(morphological.complexity.factor = as.factor(morphological.complexity)) %>%
  group_by(morphological.complexity.factor) %>%
  multi_boot(column="pop_log",
              summary_groups = c("morphological.complexity.factor"),
              statistics_functions = c("mean", "ci_lower","ci_upper")) %>%
  mutate(morphological.complexity =  
           as.numeric(as.character(morphological.complexity.factor))) %>%
  ggplot() +
  geom_pointrange(aes(x= morphological.complexity.factor,
                      y = mean, ymin = ci_lower, ymax = ci_upper)) + 
  geom_smooth(method = "lm",aes(morphological.complexity-5, mean)) +
  # theres a weird offset because complexity is a factor?
  ylab("Log population") + 
  xlab("Morphological complexity") +
  ggtitle("Lupyan and Dale (2010) Fig. 3 reproduction") +
  theme_bw() 

0.8 Relationship between language variables for AOA

# predicted relationship with AOA  with TTR and number of consonants

tidy(lmer(mean.aoa ~ morphological.complexity + (morphological.complexity|lang.family) + (1|country), d.clean))[2, "statistic"]
cor.test(d.clean$mean.aoa, d.clean$morphological.complexity)

tidy(lmer(mean.aoa ~ p.complexity.bias + (p.complexity.bias|lang.family) + (1|country), d.clean))[2, "statistic"]
cor.test(d.clean$mean.aoa, d.clean$p.complexity.bias)

# **
tidy(lmer(mean.aoa ~ scaled.LDT.TTR + (scaled.LDT.TTR|lang.family) + (1|country), d.clean))[2, "statistic"]
cor.test(d.clean$mean.aoa, d.clean$scaled.LDT.TTR)
cor.test(d.clean$mean.aoa, d.clean$scaled.LDT.H)
cor.test(d.clean$mean.aoa, d.clean$scaled.LDT.ZM)

tidy(lmer(mean.aoa ~ mean.dependency.length + (mean.dependency.length|lang.family) + (1|country), d.clean))[2, "statistic"]
cor.test(d.clean$mean.aoa, d.clean$mean.dependency.length)

tidy(lmer(mean.aoa ~ mean.length + (mean.length|lang.family) + (1|country), d.clean)) [2, "statistic"]
cor.test(d.clean$mean.aoa, d.clean$mean.length)

# **
tidy(lmer(mean.aoa ~ n.consonants_log + (n.consonants_log|lang.family) + (1|country) , d.clean))[2, "statistic"]
tidy(lmer(mean.aoa ~ normalized.consonant.diversity  + (1|country) , d.clean)) [2, "statistic"]

tidy(lmer(mean.aoa ~ n.phonemes_log + (n.phonemes_log|lang.family) + (1|country) , d.clean))[2, "statistic"]

tidy(lmer(mean.aoa ~ n.vowels_log + (n.vowels_log|lang.family) + (1|country) , d.clean))[2, "statistic"]

tidy(lmer(mean.aoa ~ syllable.rate  + (1|country) , d.clean))[2, "statistic"]

tidy(lmer(mean.aoa ~ information.density  + (1|country) , d.clean))[2, "statistic"]

tidy(lmer(mean.aoa ~ information.rate  + (1|country) , d.clean)) [2, "statistic"]

ggplot(d.clean, aes(mean.length, mean.aoa, label = ISO)) +
  geom_point() +
  geom_label() +
  geom_smooth(method = "lm")

ggplot(d.clean, aes(scaled.LDT.TTR, mean.aoa, label = ISO)) +
  geom_point() +
  geom_label() +
  geom_smooth(method = "lm")

ggplot(d.clean, aes(n.consonants_log, mean.aoa, label = ISO)) +
  geom_point() +
  geom_label() +
  geom_smooth(method = "lm")

ggplot(d.clean, aes(n.phonemes_log, mean.aoa, label = ISO)) +
  geom_point() +
  geom_label() +
  geom_smooth(method = "lm")

ggplot(d.clean, aes(n.vowels_log, mean.aoa, label = ISO)) +
  geom_point() +
  geom_label() +
  geom_smooth(method = "lm")

ggplot(d.clean, aes(mean.dependency.length , mean.aoa, label = ISO)) +
  geom_point() +
  geom_label() +
  geom_smooth(method = "lm")